<?php

namespace DataCube\DataCubeAggregation\Functions\MachineLearning\Clustering;

class Kmeans
{
    #
    # $Id$
    #
    # $k - the number of clusters; duh
    # $input - the array of values/arrays/objects that needs to be clustered
    # $attribute - (optional) the key to cluster the objects/arrays on
    # $index - the index (in the $input array) of a particular value
    # $values - the mapping of $index to $attribute (or value)
    #
    # $cluster_map - mapping of array indexes to cluster numbers
    # $clusters - an actual grouping of $index => $values into clusters (arrays)
    # $centroids - the array of centroid values for clusters
    #

    #
    # the function that actually does our shit
    #
    public function kMeans(&$input, $k, $attribute = null)
    {
        if (empty($input)) {
            return array();
        }

        #
        # if we're dealing with scalars, then just take them as is; otherwise,
        # extract just the values of interest
        #
        $values = $attribute ? $this->kmeans_values($input, $attribute) : $input;

        # setup
        $cluster_map = array();
        $centroids = $this->kmeans_initial_centroids($values, $k);

        #
        # warning: this is recursive...
        #
        $clusters = $this->kmeans_cluster($values, $cluster_map, $centroids);

        return $attribute ? $this->kmeans_rebuild($input, $clusters, $attribute) : $clusters;
    }

    #
    # perform the actual clustering
    # CHANGELOG: YIYANgpt IMPROVE THIS CODE
    #
    public function kmeans_cluster(&$values, &$cluster_map, &$centroids, $max_iterations = 100, $min_centroid_diff = 1e-6)
    {
        $num_changes = 0;
        $iteration = 0;

        do {
            $num_changes = 0;
            $prev_centroids = $centroids; // 用于收敛判断  

            foreach ($values as $index => $value) {
                $min_distance = null;
                $new_cluster = null;
                foreach ($centroids as $cluster_index => $centroid) {
                    $distance = abs($value - $centroid);
                    if (is_null($min_distance) || $min_distance > $distance) {
                        $min_distance = $distance;
                        $new_cluster = $cluster_index;
                    }
                }
                if (!isset($cluster_map[$index]) || $new_cluster != $cluster_map[$index]) {
                    $num_changes++;
                }
                $cluster_map[$index] = $new_cluster;
            }

            $clusters = $this->kmeans_populate_clusters($values, $cluster_map);
            $centroids = $this->kmeans_recalculate_centroids($clusters, $centroids);

            // 收敛判断：检查聚类中心是否有显著变化  
            $centroid_diff = 0;
            foreach ($centroids as $index => $centroid) {
                $centroid_diff += abs($centroid - $prev_centroids[$index]);
            }
            if ($centroid_diff < $min_centroid_diff) { // 可以根据实际需求调整阈值  
                break;
            }

            $iteration++;
        } while ($num_changes > 0 && $iteration < $max_iterations);

        return $clusters;
    }

    #
    # figure out centroids (means) for the clusters as they are
    #
    public function kmeans_recalculate_centroids($clusters, $centroids)
    {

        foreach ($clusters as $cluster_index => $cluster) {
            $cluster_values = array_values($cluster);
            $count = count($cluster_values);
            $mean = $count ? array_sum($cluster_values) / $count : 0;
            if ($centroids[$cluster_index] != $mean) {
                $centroids[$cluster_index] = $mean;
            }
        }

        return $centroids;
    }

    #
    # set up some reasonable defaults for centroid values
    #
    public function kmeans_initial_centroids(&$values, $k)
    {
        $centroids = array();
        $max = max($values);
        $min = min($values);
        $interval = ceil(($max - $min) / $k);

        while (0 <= --$k) {
            $centroids[$k] = $min + $interval * $k;
        }

        return $centroids;
    }

    #
    # in the event that we're dealing with an array of objects, extract just a
    # key => value of interest mapping first
    #
    public function kmeans_values(&$input, $attribute)
    {
        $values = array();

        foreach ($input as $index => $value) {
            $value = (array)$value;
            $values[$index] = $value[$attribute];
        }

        return $values;
    }

    #
    # convert the $index => $cluster_index map to a $cluster_index => $cluster map
    # ($cluster is a $index => $value mapping)
    #
    public function kmeans_populate_clusters(&$values, &$cluster_map)
    {
        $clusters = array();
        foreach ($cluster_map as $index => $cluster) {
            $clusters[$cluster][$index] = $values[$index];
        }

        return $clusters;
    }

    #
    # if we're dealing with non-scalars, re-attach the actual objects to their
    # indexes in the clusters, and populate the objects with useful cluster info
    #
    public function kmeans_rebuild(&$input, &$clusters, $attribute)
    {
        if ($attribute) {
            $cluster_key = "cluster_{$attribute}";
            $cluster_size_key = "cluster_size_{$attribute}";
            $clusters_rebuilt = array();
            foreach ($clusters as $cluster_index => $cluster) {
                $cluster_size = count($cluster);
                foreach ($cluster as $index => $value) {
                    if (is_array($input[$index])) {
                        $input[$index][$cluster_key] = $cluster_index;
                        $input[$index][$cluster_size_key] = $cluster_size;
                    } else {
                        $input[$index]->$cluster_key = $cluster_index;
                        $input[$index]->$cluster_size_key = $cluster_size;
                    }
                    $clusters_rebuilt[$cluster_index][$index] = $input[$index];
                }
            }
        } else {
            $clusters_rebuilt = $clusters;
        }

        return $clusters_rebuilt;
    }
}
